# import relevant modules
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions
# set to *.csv to process all
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)
#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')
# save to parquet
#suggestions_df.to_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# Load the cached suggestions table (written by the commented-out
# preprocessing steps above) instead of re-reading the raw CSV files.
suggestions_df = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')


def _split_on_spaces(value):
    """Return ``str(value)`` split on single space characters."""
    return str(value).split(' ')


# tokenize suggestions: one list of space-separated tokens per row
suggestions_df['tokens'] = suggestions_df['suggestion'].map(_split_on_spaces)
# notebook preview of the first rows (has no effect when run as a script)
suggestions_df.head(3)
# NOTE(review): vectorize_suggestions is a project helper; presumably it
# returns the suggestions together with one vector per suggestion - confirm.
suggestions, vector_data = vectorize_suggestions(suggestions_df)
# t-SNE transformation for plotting (and clustering) in 2d.
# t-SNE is stochastic: without a fixed seed every run yields a different
# embedding, so the cluster scores computed on it - and the cluster count
# chosen from them further down - could not be reproduced.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(vector_data)
# Score k-means on the 2d embedding and give the columns their display names.
# NOTE(review): the arguments 10 and 100 are presumably the bounds of the
# cluster-count sweep - confirm against clustering.kmeans_suggestions.
kmeans_scores = pd.DataFrame(data=kmeans_suggestions(X_tsne, 10, 100)).rename(
    columns={
        'calinski_harabasz_score': 'Calinski Harabasz Score',
        'silhouette_score': 'Silhouette Score',
        'num_cluster': 'Anzahl Cluster',
    }
)

# One panel per metric, both plotted against the number of clusters.
fig = make_subplots(1, 2)
kmeans_panels = (
    ('Silhouette Score', 'rgb(133, 92, 117)'),
    ('Calinski Harabasz Score', 'rgb(217, 175, 107)'),
)
for panel_col, (metric, colour) in enumerate(kmeans_panels, start=1):
    trace = go.Scatter(
        x=kmeans_scores['Anzahl Cluster'],
        y=kmeans_scores[metric],
        name=metric,
        line={'color': colour},
    )
    fig.add_trace(trace, row=1, col=panel_col)
    fig.update_yaxes(title_text=metric, row=1, col=panel_col)
    fig.update_xaxes(title_text='Anzahl Cluster', row=1, col=panel_col)

fig.update_layout(
    template='simple_white',
    font={'family': 'Computer Modern', 'color': 'black', 'size': 15},
)
fig.show()
%reload_ext autoreload
%autoreload 2
# Re-import so the name is bound again after the autoreload setup above.
from clustering import dbscan_suggestions

# Score DBSCAN on the 2d embedding and give the columns their display names.
dbscan_scores = pd.DataFrame(data=dbscan_suggestions(X_tsne)).rename(
    columns={
        'eps': 'Maximale Distanz',
        'silhouette_score': 'Silhouette Score',
        'calinski_harabasz_score': 'Calinski Harabasz Score',
        'num_cluster': 'Anzahl Cluster',
        'num_noise': 'Anzahl Rauschpunkte',
    }
)

# One panel per metric, both plotted against the eps column.
fig = make_subplots(1, 2)
dbscan_panels = (
    ('Silhouette Score', 'rgb(133, 92, 117)'),
    ('Calinski Harabasz Score', 'rgb(217, 175, 107)'),
)
for panel_col, (metric, colour) in enumerate(dbscan_panels, start=1):
    trace = go.Scatter(
        x=dbscan_scores['Maximale Distanz'],
        y=dbscan_scores[metric],
        name=metric,
        line={'color': colour},
    )
    fig.add_trace(trace, row=1, col=panel_col)
    fig.update_yaxes(title_text=metric, row=1, col=panel_col)
    fig.update_xaxes(title_text='Maximale Distanz', row=1, col=panel_col)

fig.update_layout(
    template='simple_white',
    font={'family': 'Computer Modern', 'color': 'black', 'size': 15},
)
fig.show()
Aktuell: Entscheidung anhand des Calinski-Harabasz-Scores für k-Means mit 94 Clustern.
# Final model: k-means on the 2d embedding with the chosen cluster count,
# followed by export of the per-suggestion assignment and a scatter plot.
from sklearn import cluster, metrics

# Fixed seed so the printed scores and the saved assignment are reproducible
# for a given embedding.
kmeans = cluster.KMeans(n_clusters=94, random_state=42)
labels = kmeans.fit_predict(X_tsne)
print(f'Silhouette Score: {metrics.silhouette_score(X_tsne, labels)}\nCalinski Harabasz Score: {metrics.calinski_harabasz_score(X_tsne, labels)}')

# save suggestions tokenized in list (drops falsy entries)
# NOTE(review): if this filter ever removes an entry, the list no longer has
# one element per row of X_tsne and the column assignment below raises -
# confirm that vectorize_suggestions never returns empty suggestions.
suggestions = [x for x in suggestions if x]

# create output df and plot
output_df = pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
output_df['suggestion'] = suggestions
output_df['cluster'] = labels
# Attach the vectors BEFORE sorting: the assignment is positional, so doing it
# after the sort (which reorders rows and resets the index) would pair every
# suggestion with another row's vector.
output_df['vector'] = list(vector_data)
output_df.sort_values(by='cluster', inplace=True, ignore_index=True)
# Cast to str only after the numeric sort so the cluster order stays numeric.
output_df['cluster'] = output_df['cluster'].apply(str)

# save output df
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')

# Display names for the plot only; the JSON above keeps the original columns.
output_df.rename(columns={'cluster':'Cluster', 'suggestion':'Suggestion'}, inplace=True)
fig = px.scatter(output_df, x='t-SNE(x)', y='t-SNE(y)', color='Cluster', hover_name='Suggestion',
                 template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()